From 4cafea04fa09012a1bb2f2941052ae606c751ced Mon Sep 17 00:00:00 2001 From: "emellor@ewan" Date: Tue, 27 Sep 2005 13:53:06 +0100 Subject: [PATCH] Detangle the restart/reboot/halt/save/restore code. There is now one point for management of shutdown state, XendDomainInfo.refreshShutdown, which is able to cope whatever the current state of the domain. This fixes bug #124: running reboot within a domU doesn't bring the domain back up after shutdown, and bug #256: "xm reboot" could not make domU reboot. Fix the refreshing inside XendDomain to ensure that the values returned by xm list reflect reality correctly. We were removing XendDomainInfo instances, but not creating them on start-up, so if xend were restarted, domain information was being lost. Merge XendDomain.dom0_unknown and initial_refresh into XendDomainInfo.recreate. Catch exceptions inside callInfo, and reraise them as XendErrors. Remove unused XendDomain.close. Further improvements are needed to XendDomainInfo to ensure that it avoids all race conditions on shutdown, and can cope correctly with xend restarting during a shutdown. Signed-off-by: Ewan Mellor --- tools/python/xen/xend/XendDomain.py | 252 ++---------- tools/python/xen/xend/XendDomainInfo.py | 504 +++++++++++++++--------- 2 files changed, 353 insertions(+), 403 deletions(-) diff --git a/tools/python/xen/xend/XendDomain.py b/tools/python/xen/xend/XendDomain.py index 3174344fc3..54faa6e33d 100644 --- a/tools/python/xen/xend/XendDomain.py +++ b/tools/python/xen/xend/XendDomain.py @@ -34,10 +34,8 @@ from xen.xend.XendError import XendError from xen.xend.XendLogging import log from xen.xend import scheduler from xen.xend.server import relocate -from xen.xend.uuid import getUuid from xen.xend.xenstore import XenNode, DBMap from xen.xend.xenstore.xstransact import xstransact -from xen.xend.xenstore.xsutil import GetDomainPath xc = xen.lowlevel.xc.new() @@ -47,14 +45,7 @@ eserver = EventServer.instance() __all__ = [ "XendDomain" ] -SHUTDOWN_TIMEOUT = 30 -PRIV_DOMAIN = 0 - -def is_dead(dom): - return dom['crashed'] or dom['shutdown'] or ( - dom['dying'] and not(dom['running'] or dom['paused'] or - dom['blocked'])) - +PRIV_DOMAIN = 0 class XendDomainDict(dict): def get_by_name(self, name): @@ -77,11 +68,10 @@ class XendDomain: # So we stuff the XendDomain instance (self) into xroot's components. xroot.add_component("xen.xend.XendDomain", self) self.domains = XendDomainDict() - self.domroot = "/domain" self.vmroot = "/domain" self.dbmap = DBMap(db=XenNode(self.vmroot)) self.watchReleaseDomain() - self.initial_refresh() + self.refresh() self.dom0_setup() def list(self): @@ -110,9 +100,7 @@ class XendDomain: return map(lambda x: x.getName(), doms) def onReleaseDomain(self): - self.reap() self.refresh() - self.domain_restarts() def watchReleaseDomain(self): from xen.xend.xenstore.xswatch import xswatch @@ -141,43 +129,22 @@ class XendDomain: dominfo = dominfo[0] return dominfo - def initial_refresh(self): - """Refresh initial domain info from db. - """ - doms = self.xen_domains() - self.dbmap.readDB() # XXX only needed for "xend" - for dom in doms.values(): - domid = dom['dom'] - dompath = GetDomainPath(domid) - if not dompath: - continue - vmpath = xstransact.Read(dompath, "vm") - if not vmpath: - continue - uuid = xstransact.Read(vmpath, "uuid") - if not uuid: - continue - log.info("recreating domain %d, uuid %s" % (domid, uuid)) - dompath = "/".join(dompath.split("/")[0:-1]) - try: - dominfo = XendDomainInfo.recreate(uuid, dompath, domid, dom) - except Exception, ex: - log.exception("Error recreating domain info: id=%d", domid) - continue - self._add_domain(dominfo) - self.reap() - self.refresh() - self.domain_restarts() + + def recreate_domain(self, xeninfo): + """Refresh initial domain info from db.""" + + dominfo = XendDomainInfo.recreate(xeninfo) + self._add_domain(dominfo) + return dominfo + def dom0_setup(self): dom0 = self.domain_lookup(PRIV_DOMAIN) if not dom0: - dom0 = self.dom0_unknown() - dom0.dom0_init_store() + dom0 = self.recreate_domain(self.xen_domain(PRIV_DOMAIN)) + dom0.dom0_init_store() dom0.dom0_enforce_vcpus() - def close(self): - pass def _add_domain(self, info, notify=True): """Add a domain entry to the tables. @@ -219,44 +186,30 @@ class XendDomain: if (domid is None) or (domid == id): domdb.delete() - def reap(self): - """Look for domains that have crashed or stopped. - Tidy them up. - """ - doms = self.xen_domains() - for d in doms.values(): - if not is_dead(d): - continue - domid = d['dom'] - dominfo = self.domains.get(domid) - if not dominfo or dominfo.is_terminated(): - continue - log.debug('domain died name=%s domid=%d', dominfo.getName(), domid) - if d['crashed'] and xroot.get_enable_dump(): - self.domain_dumpcore(domid) - if d['shutdown']: - reason = shutdown_reason(d['shutdown_reason']) - log.debug('shutdown name=%s id=%d reason=%s', - dominfo.getName(), domid, reason) - if reason == 'suspend': - dominfo.state_set("suspended") - continue - if reason in ['poweroff', 'reboot']: - self.domain_restart_schedule(domid, reason) - dominfo.destroy() def refresh(self): """Refresh domain list from Xen. """ doms = self.xen_domains() - # Remove entries for domains that no longer exist. - # Update entries for existing domains. for d in self.domains.values(): info = doms.get(d.getDomid()) if info: d.update(info) - elif not d.restart_pending(): + else: self._delete_domain(d.getDomid()) + for d in doms: + if d not in self.domains: + try: + self.recreate_domain(doms[d]) + except: + log.exception( + "Failed to recreate information for domain %d. " + "Destroying it in the hope of recovery.", d) + try: + xc.domain_destroy(dom = d) + except: + log.exception('Destruction of %d failed.', d) + def update_domain(self, id): """Update information for a single domain. @@ -281,30 +234,6 @@ class XendDomain: self._add_domain(dominfo) return dominfo - def domain_restart(self, dominfo): - """Restart a domain. - - @param dominfo: domain object - """ - log.info("Restarting domain: name=%s id=%s", dominfo.getName(), - dominfo.getDomid()) - eserver.inject("xend.domain.restart", - [dominfo.getName(), dominfo.getDomid(), "begin"]) - try: - dominfo.restart() - log.info('Restarted domain name=%s id=%s', dominfo.getName(), - dominfo.getDomid()) - eserver.inject("xend.domain.restart", - [dominfo.getName(), dominfo.getDomid(), - "success"]) - self.domain_unpause(dominfo.getDomid()) - except Exception, ex: - log.exception("Exception restarting domain: name=%s id=%s", - dominfo.getName(), dominfo.getDomid()) - eserver.inject("xend.domain.restart", - [dominfo.getName(), dominfo.getDomid(), "fail"]) - return dominfo - def domain_configure(self, config): """Configure an existing domain. This is intended for internal use by domain restore and migrate. @@ -345,33 +274,7 @@ class XendDomain: self.update_domain(id) return self.domains.get(id) - def dom0_unknown(self): - dom0 = PRIV_DOMAIN - uuid = None - info = self.xen_domain(dom0) - dompath = GetDomainPath(dom0) - if dompath: - vmpath = xstransact.Read(dompath, "vm") - if vmpath: - uuid = xstransact.Read(vmpath, "uuid") - if not uuid: - uuid = dompath.split("/")[-1] - dompath = "/".join(dompath.split("/")[0:-1]) - if not uuid: - uuid = getUuid() - dompath = self.domroot - log.info("Creating entry for unknown xend domain: id=%d uuid=%s", - dom0, uuid) - try: - dominfo = XendDomainInfo.recreate(uuid, dompath, dom0, info) - self._add_domain(dominfo) - return dominfo - except Exception, exn: - log.exception(exn) - raise XendError("Error recreating xend domain info: id=%d: %s" % - (dom0, str(exn))) - def domain_lookup(self, id): return self.domains.get(id) @@ -410,8 +313,9 @@ class XendDomain: return xc.domain_pause(dom=dominfo.getDomid()) except Exception, ex: raise XendError(str(ex)) - - def domain_shutdown(self, id, reason='poweroff'): + + + def domain_shutdown(self, domid, reason='poweroff'): """Shutdown domain (nicely). - poweroff: restart according to exit code and restart mode - reboot: restart on exit @@ -422,89 +326,13 @@ class XendDomain: @param id: domain id @param reason: shutdown type: poweroff, reboot, suspend, halt """ - dominfo = self.domain_lookup(id) - self.domain_restart_schedule(dominfo.getDomid(), reason, force=True) - eserver.inject('xend.domain.shutdown', [dominfo.getName(), - dominfo.getDomid(), reason]) - if reason == 'halt': - reason = 'poweroff' - val = dominfo.shutdown(reason) - if not reason in ['suspend']: - self.domain_shutdowns() - return val - - - def domain_sysrq(self, id, key): - """Send a SysRq to the specified domain.""" - return self.callInfo(id, XendDomainInfo.send_sysrq, key) - - - def domain_shutdowns(self): - """Process pending domain shutdowns. - Destroys domains whose shutdowns have timed out. - """ - timeout = SHUTDOWN_TIMEOUT + 1 - for dominfo in self.domains.values(): - if not dominfo.shutdown_pending: - # domain doesn't need shutdown - continue - id = dominfo.getDomid() - left = dominfo.shutdown_time_left(SHUTDOWN_TIMEOUT) - if left <= 0: - # Shutdown expired - destroy domain. - try: - log.info("Domain shutdown timeout expired: name=%s id=%s", - dominfo.getName(), id) - self.domain_destroy(id, reason= - dominfo.shutdown_pending['reason']) - except Exception: - pass - else: - # Shutdown still pending. - timeout = min(timeout, left) - if timeout <= SHUTDOWN_TIMEOUT: - # Pending shutdowns remain - reschedule. - scheduler.later(timeout, self.domain_shutdowns) + self.callInfo(domid, XendDomainInfo.shutdown, reason) - def domain_restart_schedule(self, id, reason, force=False): - """Schedule a restart for a domain if it needs one. - @param id: domain id - @param reason: shutdown reason - """ - log.debug('domain_restart_schedule> %d %s %d', id, reason, force) - dominfo = self.domain_lookup(id) - if not dominfo: - return - restart = (force and reason == 'reboot') or dominfo.restart_needed(reason) - if restart: - log.info('Scheduling restart for domain: name=%s id=%s', - dominfo.getName(), dominfo.getDomid()) - eserver.inject("xend.domain.restart", - [dominfo.getName(), dominfo.getDomid(), - "schedule"]) - dominfo.restarting() - else: - log.info('Cancelling restart for domain: name=%s id=%s', - dominfo.getName(), dominfo.getDomid()) - eserver.inject("xend.domain.restart", - [dominfo.getName(), dominfo.getDomid(), "cancel"]) - dominfo.restart_cancel() + def domain_sysrq(self, domid, key): + """Send a SysRq to the specified domain.""" + return self.callInfo(domid, XendDomainInfo.send_sysrq, key) - def domain_restarts(self): - """Execute any scheduled domain restarts for domains that have gone. - """ - doms = self.xen_domains() - for dominfo in self.domains.values(): - if not dominfo.restart_pending(): - continue - info = doms.get(dominfo.getDomid()) - if info: - # Don't execute restart for domains still running. - continue - # Remove it from the restarts. - log.info('restarting: %s' % dominfo.getName()) - self.domain_restart(dominfo) def domain_destroy(self, domid, reason='halt'): """Terminate domain immediately. @@ -517,7 +345,6 @@ class XendDomain: if domid == PRIV_DOMAIN: raise XendError("Cannot destroy privileged domain %i" % domid) - self.domain_restart_schedule(domid, reason, force=True) dominfo = self.domain_lookup(domid) if dominfo: val = dominfo.destroy() @@ -730,10 +557,15 @@ class XendDomain: ## private: def callInfo(self, domid, fn, *args, **kwargs): - self.refresh() - dominfo = self.domains.get(domid) - if dominfo: - return fn(dominfo, *args, **kwargs) + try: + self.refresh() + dominfo = self.domains.get(domid) + if dominfo: + return fn(dominfo, *args, **kwargs) + except XendError: + raise + except Exception, exn: + raise XendError(str(exn)) def instance(): diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py index 70c8755507..14410f9e6c 100644 --- a/tools/python/xen/xend/XendDomainInfo.py +++ b/tools/python/xen/xend/XendDomainInfo.py @@ -35,7 +35,9 @@ from xen.util.blkif import blkdev_uname_to_file from xen.xend.server.channel import EventChannel from xen.xend import image +from xen.xend import scheduler from xen.xend import sxp +from xen.xend import XendRoot from xen.xend.XendBootloader import bootloader from xen.xend.XendLogging import log from xen.xend.XendError import XendError, VmError @@ -43,7 +45,7 @@ from xen.xend.XendRoot import get_component from xen.xend.uuid import getUuid from xen.xend.xenstore.xstransact import xstransact -from xen.xend.xenstore.xsutil import IntroduceDomain +from xen.xend.xenstore.xsutil import GetDomainPath, IntroduceDomain """Shutdown code for poweroff.""" DOMAIN_POWEROFF = 0 @@ -75,9 +77,6 @@ restart_modes = [ RESTART_NEVER, ] -STATE_RESTART_PENDING = 'pending' -STATE_RESTART_BOOTING = 'booting' - STATE_VM_OK = "ok" STATE_VM_TERMINATED = "terminated" STATE_VM_SUSPENDED = "suspended" @@ -92,7 +91,29 @@ SIF_NET_BE_DOMAIN = (1<<5) SIF_TPM_BE_DOMAIN = (1<<7) +SHUTDOWN_TIMEOUT = 30 + + +DOMROOT = '/domain' +VMROOT = '/domain' + + xc = xen.lowlevel.xc.new() +xroot = XendRoot.instance() + + +## Configuration entries that we expect to round-trip -- be read from the +# config file or xc, written to save-files (i.e. through sxpr), and reused as +# config on restart or restore, all without munging. Some configuration +# entries are munged for backwards compatibility reasons, or because they +# don't come out of xc in the same form as they are specified in the config +# file, so those are handled separately. +ROUNDTRIPPING_CONFIG_ENTRIES = [ + ('name', str), + ('ssidref', int), + ('cpu_weight', float), + ('bootloader', str) + ] def domain_exists(name): @@ -145,22 +166,43 @@ class XendDomainInfo: vm = cls(getUuid(), dompath, cls.parseConfig(config)) vm.construct() + vm.refreshShutdown() return vm create = classmethod(create) - def recreate(cls, uuid, dompath, domid, info): - """Create the VM object for an existing domain. + def recreate(cls, xeninfo): + """Create the VM object for an existing domain.""" - @param dompath: The path to all domain information - @param info: domain info from xc - """ + log.debug("XendDomainInfo.recreate(%s)", xeninfo) - log.debug("XendDomainInfo.recreate(%s, %s, %s, %s)", uuid, dompath, - domid, info) + domid = xeninfo['dom'] + try: + dompath = GetDomainPath(domid) + if not dompath: + raise XendError( + 'No domain path in store for existing domain %d' % domid) + vmpath = xstransact.Read(dompath, "vm") + if not vmpath: + raise XendError( + 'No vm path in store for existing domain %d' % domid) + uuid = xstransact.Read(vmpath, "uuid") + if not uuid: + raise XendError( + 'No vm/uuid path in store for existing domain %d' % domid) + + dompath = "/".join(dompath.split("/")[0:-1]) + except Exception, exn: + log.warn(str(exn)) + dompath = DOMROOT + uuid = getUuid() + + log.info("Recreating domain %d, uuid %s", domid, uuid) - return cls(uuid, dompath, info, domid, True) + vm = cls(uuid, dompath, xeninfo, domid, True) + vm.refreshShutdown(xeninfo) + return vm recreate = classmethod(recreate) @@ -183,14 +225,12 @@ class XendDomainInfo: except TypeError, exn: raise VmError('Invalid ssidref in config: %s' % exn) - log.debug('restoring with ssidref = %d' % ssidref) - vm = cls(uuid, dompath, cls.parseConfig(config), xc.domain_create(ssidref = ssidref)) - vm.clear_shutdown() vm.create_channel() vm.configure() vm.exportToDB() + vm.refreshShutdown() return vm restore = classmethod(restore) @@ -214,33 +254,28 @@ class XendDomainInfo: log.debug("parseConfig: config is %s" % str(config)) result = {} - imagecfg = "()" - result['name'] = get_cfg('name') - result['ssidref'] = get_cfg('ssidref', int) + for e in ROUNDTRIPPING_CONFIG_ENTRIES: + result[e[0]] = get_cfg(e[0], e[1]) + result['memory'] = get_cfg('memory', int) result['mem_kb'] = get_cfg('mem_kb', int) result['maxmem'] = get_cfg('maxmem', int) result['maxmem_kb'] = get_cfg('maxmem_kb', int) result['cpu'] = get_cfg('cpu', int) - result['cpu_weight'] = get_cfg('cpu_weight', float) - result['bootloader'] = get_cfg('bootloader') result['restart_mode'] = get_cfg('restart') + result['image'] = get_cfg('image') try: - imagecfg = get_cfg('image') - - if imagecfg: - result['image'] = imagecfg - result['vcpus'] = int(sxp.child_value(imagecfg, 'vcpus', - 1)) + if result['image']: + result['vcpus'] = int(sxp.child_value(result['image'], + 'vcpus', 1)) else: result['vcpus'] = 1 except TypeError, exn: raise VmError( 'Invalid configuration setting: vcpus = %s: %s' % - (sxp.child_value(imagecfg, 'vcpus', 1), - str(exn))) + (sxp.child_value(result['image'], 'vcpus', 1), str(exn))) result['backend'] = [] for c in sxp.children(config, 'backend'): @@ -283,26 +318,26 @@ class XendDomainInfo: self.store_mfn = None self.console_channel = None self.console_mfn = None - - #todo: state: running, suspended + self.state = STATE_VM_OK self.state_updated = threading.Condition() - self.shutdown_pending = None - self.restart_state = None - self.restart_time = None - self.restart_count = 0 - self.writeVm("uuid", self.uuid) self.storeDom("vm", self.path) def augmentInfo(self): + """Augment self.info, as given to us through {@link #recreate}, with + values taken from the store. This recovers those values known to xend + but not to the hypervisor. + """ def useIfNeeded(name, val): if not self.infoIsSet(name) and val is not None: self.info[name] = val params = (("name", str), + ("restart-mode", str), + ("image", str), ("start-time", float)) from_store = self.gatherVm(*params) @@ -322,13 +357,18 @@ class XendDomainInfo: defaultInfo('name', lambda: "Domain-%d" % self.domid) defaultInfo('ssidref', lambda: 0) defaultInfo('restart_mode', lambda: RESTART_ONREBOOT) + defaultInfo('cpu', lambda: None) defaultInfo('cpu_weight', lambda: 1.0) defaultInfo('bootloader', lambda: None) defaultInfo('backend', lambda: []) defaultInfo('device', lambda: []) + defaultInfo('image', lambda: None) self.check_name(self.info['name']) + if isinstance(self.info['image'], str): + self.info['image'] = sxp.from_string(self.info['image']) + # Internally, we keep only maxmem_KiB, and not maxmem or maxmem_kb # (which come from outside, and are in MiB and KiB respectively). # This means that any maxmem or maxmem_kb settings here have come @@ -451,17 +491,16 @@ class XendDomainInfo: 'domid': str(self.domid), 'uuid': self.uuid, - 'restart_time': str(self.restart_time), - - 'xend/state': self.state, - 'xend/restart_count': str(self.restart_count), 'xend/restart_mode': str(self.info['restart_mode']), 'memory/target': str(self.info['memory_KiB']) } for (k, v) in self.info.items(): - to_store[k] = str(v) + if v: + to_store[k] = str(v) + + to_store['image'] = sxp.to_string(self.info['image']) log.debug("Storing %s" % str(to_store)) @@ -513,6 +552,88 @@ class XendDomainInfo: self.info['backend'], 0) + def refreshShutdown(self, xeninfo = None): + if xeninfo is None: + xeninfo = dom_get(self.domid) + if xeninfo is None: + # The domain no longer exists. This will occur if we have + # scheduled a timer to check for shutdown timeouts and the + # shutdown succeeded. + return + + if xeninfo['dying']: + # Dying means that a domain has been destroyed, but has not yet + # been cleaned up by Xen. This could persist indefinitely if, + # for example, another domain has some of its pages mapped. + # We might like to diagnose this problem in the future, but for + # now all we can sensibly do is ignore it. + pass + + elif xeninfo['crashed']: + log.warn('Domain has crashed: name=%s id=%d.', + self.info['name'], self.domid) + + if xroot.get_enable_dump(): + self.dumpCore() + + self.maybeRestart('crashed') + + elif xeninfo['shutdown']: + reason = shutdown_reason(xeninfo['shutdown_reason']) + + log.info('Domain has shutdown: name=%s id=%d reason=%s.', + self.info['name'], self.domid, reason) + + self.clearRestart() + + if reason == 'suspend': + self.state_set(STATE_VM_SUSPENDED) + # Don't destroy the domain. XendCheckpoint will do this once + # it has finished. + elif reason in ['poweroff', 'reboot']: + self.maybeRestart(reason) + else: + self.destroy() + + else: + # Domain is alive. If we are shutting it down, then check + # the timeout on that, and destroy it if necessary. + + sst = self.readVm('xend/shutdown_start_time') + if sst: + sst = float(sst) + timeout = SHUTDOWN_TIMEOUT - time.time() + sst + if timeout < 0: + log.info( + "Domain shutdown timeout expired: name=%s id=%s", + self.info['name'], self.domid) + self.destroy() + else: + log.debug( + "Scheduling refreshShutdown on domain %d in %ds.", + self.domid, timeout) + scheduler.later(timeout, self.refreshShutdown) + + + def shutdown(self, reason): + if not reason in shutdown_reasons.values(): + raise XendError('invalid reason:' + reason) + self.storeVm("control/shutdown", reason) + if not reason in ['suspend']: + self.storeVm('xend/shutdown_start_time', time.time()) + + + def clearRestart(self): + self.removeVm("xend/shutdown_start_time") + + + def maybeRestart(self, reason): + if self.restart_needed(reason): + self.restart() + else: + self.destroy() + + def dumpCore(self): """Create a core dump for this domain. Nothrow guarantee.""" @@ -526,18 +647,32 @@ class XendDomainInfo: self.domid, self.info['name'], str(exn)) - def closeStoreChannel(self): - """Close the store channel, if any. Nothrow guarantee.""" + def closeChannel(self, channel, entry): + """Close the given channel, if set, and remove the given entry in the + store. Nothrow guarantee.""" try: - if self.store_channel: - try: - self.store_channel.close() - self.removeDom("store/port") - finally: - self.store_channel = None + try: + if channel: + channel.close() + finally: + self.removeDom(entry) except Exception, exn: log.exception(exn) + + + def closeStoreChannel(self): + """Close the store channel, if any. Nothrow guarantee.""" + + self.closeChannel(self.store_channel, "store/port") + self.store_channel = None + + + def closeConsoleChannel(self): + """Close the console channel, if any. Nothrow guarantee.""" + + self.closeChannel(self.console_channel, "console/port") + self.console_channel = None def setConsoleRef(self, ref): @@ -566,18 +701,23 @@ class XendDomainInfo: self.info.update(info) self.validateInfo() + self.refreshShutdown(info) log.debug("XendDomainInfo.update done on domain %d: %s", self.domid, self.info) + ## private: + def state_set(self, state): self.state_updated.acquire() if self.state != state: self.state = state self.state_updated.notifyAll() self.state_updated.release() - self.exportToDB() + + + ## public: def state_wait(self, state): self.state_updated.acquire() @@ -585,6 +725,7 @@ class XendDomainInfo: self.state_updated.wait() self.state_updated.release() + def __str__(self): s = "